In [1]:
import pandas as pd
import numpy as np

import pandas as pd
import re
import string

import matplotlib.pyplot as plt
import openpyxl

import tqdm

SEED = 123456789

Input Data & Preprocess¶

สวรส.

In [2]:
# input สวรส.
DATA_DIR_HRSI = "../BERTOPIC-HEALTHY-RESERCH/DATASET/hsri.xlsx"
hsri = pd.read_excel(DATA_DIR_HRSI)
hsri.info()
print('\nNull')
hsri.isnull().sum()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5909 entries, 0 to 5908
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   index     5909 non-null   int64 
 1   itemset   5909 non-null   int64 
 2   title     1914 non-null   object
 3   names     1914 non-null   object
 4   date      1914 non-null   object
 5   abstract  1913 non-null   object
dtypes: int64(2), object(4)
memory usage: 277.1+ KB

Null
Out[2]:
index          0
itemset        0
title       3995
names       3995
date        3995
abstract    3996
dtype: int64
In [3]:
# input bmc
DATA_DIR_BMC = "../BERTOPIC-HEALTHY-RESERCH/DATASET/bmc.xlsx"
bmc = pd.read_excel(DATA_DIR_BMC)
bmc.info()
print('\nNull')
bmc.isnull().sum()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1461 entries, 0 to 1460
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   index     1461 non-null   int64 
 1   pages     1461 non-null   int64 
 2   title     1429 non-null   object
 3   names     1429 non-null   object
 4   date      1429 non-null   object
 5   abstract  1429 non-null   object
 6   url       1461 non-null   object
dtypes: int64(2), object(5)
memory usage: 80.0+ KB

Null
Out[3]:
index        0
pages        0
title       32
names       32
date        32
abstract    32
url          0
dtype: int64
In [4]:
# concatenate research abstracts from 2 sources into one dataframe.
df = pd.concat([hsri,bmc]).iloc[:,2:6]
df['abstract'] = df['abstract'].astype(str).apply(openpyxl.utils.escape.unescape)
df
Out[4]:
title names date abstract
0 NaN NaN NaN nan
1 Pufferfish Poisoning เปี่ยมศักดิ์ เมนุเศวต 2550 Pufferfish belong to two families of marine an...
2 A Study of a Refill Prescription Service Syste... ระพีพรรณ ฉลองสุข 2550 The refilling of prescriptions for patients wi...
3 Evaluation of the Impact of Local Wisdom on Sa... วิทยา เมฆขำ 2550 A study was carried out to evaluate the impact...
4 Looking at Health Promotion and Disease Preven... ประคิณ สุจฉายา 2550 Children under five years old normally grow an...
... ... ... ... ...
1456 A contingent valuation study to estimate the p... Amin Mo 24 June 2004 We used contingent valuation technique to esti...
1457 Readiness to change physical activity and diet... Taylor Wendell C 10 June 2004 BackgroundComplementary or discrepant stages o...
1458 "Harnessing genomics to improve health in Indi... Acharya Tara 19 May 2004 BackgroundThe benefits of scientific medicine ...
1459 The utilisation of health research in policy-m... Hanney Stephen R 13 January 2003 The importance of health research utilisation ...
1460 Assessing capacity for health policy and syste... Gonzalez Block Miguel A 13 January 2003 BackgroundAs demand grows for health policies ...

7370 rows × 4 columns

In [5]:
df.isnull().sum()
Out[5]:
title       4027
names       4027
date        4027
abstract       0
dtype: int64
In [6]:
# drop null
df = df.dropna()
df.info()
df.head()
<class 'pandas.core.frame.DataFrame'>
Index: 3343 entries, 1 to 1460
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     3343 non-null   object
 1   names     3343 non-null   object
 2   date      3343 non-null   object
 3   abstract  3343 non-null   object
dtypes: object(4)
memory usage: 130.6+ KB
Out[6]:
title names date abstract
1 Pufferfish Poisoning เปี่ยมศักดิ์ เมนุเศวต 2550 Pufferfish belong to two families of marine an...
2 A Study of a Refill Prescription Service Syste... ระพีพรรณ ฉลองสุข 2550 The refilling of prescriptions for patients wi...
3 Evaluation of the Impact of Local Wisdom on Sa... วิทยา เมฆขำ 2550 A study was carried out to evaluate the impact...
4 Looking at Health Promotion and Disease Preven... ประคิณ สุจฉายา 2550 Children under five years old normally grow an...
6 Over-crowding Problems in Hospitals สุพัตรา ศรีวณิชชากร 2550 Overcrowding in hospitals’ out-patient sectors...
In [7]:
#word_tokenize => delete stop_words

from nltk.corpus import stopwords
import nltk
from nltk.tokenize import word_tokenize

# func. delete stopwords
def clear_stopword(sentence_list):
    return [i for i in sentence_list if i not in stopwords.words('english')]

# define all bstract to list
abstracts = []
for sentence in df['abstract'].values:
    abstracts.append(sentence.lower())

# tokenize words & delete stopwords
abstracts = [' '.join(clear_stopword(word_tokenize(i))) for i in abstracts]

Create Embedding Model¶

In [8]:
# define device is "GPU"
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
cuda
In [9]:
from sentence_transformers import SentenceTransformer

# embedding abstract 
sentence_model = SentenceTransformer("all-MiniLM-L6-v2", device=device) # paraphrase-multilingual-MiniLM-L12-v2
embeddings = sentence_model.encode(abstracts, batch_size=64, show_progress_bar=True)
C:\Users\acer\AppData\Local\Programs\Python\Python310\lib\site-packages\tqdm\auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|█████████████████████████████████████████████████████████████████████████| 53/53 [00:24<00:00,  2.17it/s]
In [10]:
embeddings.shape
Out[10]:
(3343, 384)

Create Topic Model¶

In [11]:
from umap import UMAP

# defind UMAP model 
# use UMAP to reduce dimensions of the embeddings data from kernel [10] : (3343,384) --> (3343,2)
n_neighbors = 60
min_dist = 0.1
umap_model = UMAP(n_neighbors=n_neighbors, n_components=2, min_dist=min_dist, metric='cosine',verbose=False,random_state=SEED)
In [12]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# model vectorize abstract 
vectorizer_model = CountVectorizer(token_pattern="\S+",)

# define bertopic model
topic_model = BERTopic(#embedding_model=sentence_model,
                       umap_model=umap_model,
                       vectorizer_model=vectorizer_model,
                       language='english',#"thai",
                       #representation_model=representation_model,
                       verbose=0,)
                       #min_topic_size=50)

topics, probs = topic_model.fit_transform(abstracts,embeddings)
In [13]:
# get first 10 topic
token_word = topic_model.get_topic_info()
token_word.iloc[:12,:3]
Out[13]:
Topic Count Name
0 -1 961 -1_health_study_care_system
1 0 1160 0_research_health_evidence_policy
2 1 84 1_health_primary_care_service
3 2 69 2_diabetes_diabetic_patients_blood
4 3 68 3_drug_drugs_price_pharmaceutical
5 4 65 4_cost_hospitals_hospital_per
6 5 62 5_covid19_care_people_infection
7 6 58 6_scheme_coverage_health_care
8 7 55 7_hiv_policy_research_sexual
9 8 50 8_tb_tuberculosis_patients_treatment
10 9 50 9_water_farmers_environmental_pollution
11 10 49 10_elderly_care_older_welfare
In [14]:
fig = topic_model.visualize_topics(width=1920, height=1080)
fig.write_html("de-topic.html")
topic_model.visualize_barchart()

Visualization¶

In [15]:
# transform embeddings data (data from kernel [9])
project_emb_umap = umap_model.fit_transform(embeddings)

Visual by BERTopic¶

In [16]:
topic_model.visualize_documents(abstracts, reduced_embeddings=project_emb_umap, custom_labels=True)

Mannual Visual¶

In [17]:
plt.title(f'UMAP Projected Embeddings of abstract HSRI. & BMC')
plt.scatter(project_emb_umap[:, 0], project_emb_umap[:, 1], s=1, alpha=0.2)
plt.show()

Visual by Wizmap¶

https://github.com/poloclub/wizmap.git

In [18]:
import wizmap

xs = project_emb_umap[:,0].astype(float).tolist()
ys = project_emb_umap[:,1].astype(float).tolist()

# generate data and grid prepare before using wizmap
data_list = wizmap.generate_data_list(xs, ys, df['abstract'].values)
grid_dict = wizmap.generate_grid_dict(xs, ys, df['abstract'].values, 'abstracts of healthy system research')

# Save the JSON files
wizmap.save_json_files(data_list, grid_dict, output_dir='../BERTOPIC-HEALTHY-RESERCH/DATASET/WIZMAP_DATASET/HSRI_BMC/')
Start generating data list...
Start generating contours...
Start generating multi-level summaries...
3343it [00:00, 145406.60it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:03<00:00,  1.70it/s]
In [20]:
from IPython.display import IFrame
"""
web demo : https://poloclub.github.io/wizmap

"""
# i store grid.json & data.ndjson in github
grid_url = 'https://raw.githubusercontent.com/Dont-HurtMe/Visualization-Topic-model/main/DATASET/DATA_WIZMAP/grid.json'
data_url = 'https://raw.githubusercontent.com/Dont-HurtMe/Visualization-Topic-model/main/DATASET/DATA_WIZMAP/data.ndjson'

# wizmap web demo 
'''
full web page : https://poloclub.github.io/wizmap/
?dataURL=https%3A%2F%2Fraw.githubusercontent.com%2FDont-HurtMe%2FVisualization-Topic-model%2Fmain%2FDATASET%2FDATA_WIZMAP%2Fdata.ndjson
&gridURL=https%3A%2F%2Fraw.githubusercontent.com%2FDont-HurtMe%2FVisualization-Topic-model%2Fmain%2FDATASET%2FDATA_WIZMAP%2Fgrid.json

'''
display_url = f'https://poloclub.github.io/wizmap/?dataURL={data_url}&gridURL={grid_url}'

# use IPython to display web demo of wizmap 
IFrame(display_url, width=1470, height=720)
Out[20]:
In [21]:
# use wizmap func to display.
wizmap.visualize(data_url, grid_url, height=700)